###############################################################
# 1) Male and female graduate enrollment (1967 - 2015)
###############################################################

# Creating a simplified data frame excluding non profit and for profit schools as well as irrelavent totals
grad_enroll_ftf <- grad_enroll %>% 
  select(Year, full_time_m, full_time_f, part_time_m, part_time_f)

#creating a linear model for male graduate enrollment
male_grad <- lm(grad_enroll$total_males ~ grad_enroll$Year)

summary(male_grad)
## 
## Call:
## lm(formula = grad_enroll$total_males ~ grad_enroll$Year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96461 -34861 -12841  51876  95766 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -17112153    1087024  -15.74   <2e-16 ***
## grad_enroll$Year      9069        546   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
plot(male_grad)

#creating a linear model for female graduate enrollment
female_grad <- lm(grad_enroll$total_females ~ grad_enroll$Year)

summary(female_grad)
## 
## Call:
## lm(formula = grad_enroll$total_females ~ grad_enroll$Year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89397 -48101  -7633  45267 129727 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -5.896e+07  1.161e+06  -50.77   <2e-16 ***
## grad_enroll$Year  3.013e+04  5.832e+02   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9823 
## F-statistic:  2669 on 1 and 47 DF,  p-value: < 2.2e-16
plot(female_grad)

#create a graph of the models
enroll_graph <- grad_enroll %>% 
  ggplot(aes(x = Year, y = total_males))+
  geom_point(aes(color = "total_males"))+
  geom_smooth(aes(x = Year, y = total_males),method = lm, se = TRUE, size = 0.5, color = "gray20") + #plots the linear model with a confidence interval (se)
  geom_point(aes(x = Year, y = total_females, color = "total_females")) +
  geom_smooth(aes(x = Year, y = total_females),method = lm, se = TRUE, size = 0.5, color = "gray20")+
  theme_classic() +
  theme(legend.title=element_blank()) +
  scale_color_manual(" ", breaks = c("total_males", "total_females"), values = c("total_males" = "royalblue1", "total_females" = "palevioletred1"), labels=c("Males", "Females")) +
  labs(x = "\nYear", y = "Total Enrollment\n", title = "Male and Female Graduate Enrollment (1967-2015)\n") +
  scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10))

enroll_graph

###############################################################
# 2) Shifts in female PhD recipients by field (1985, 2000, and 2015)
###############################################################

# First creating a data frame with just the 5 fields in question and just the 3 years of interest.
phds_summary <- phds %>%
  select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>% 
  filter(year == "1985" | year == "2000" | year == "2015") %>% 
  select("physci_f", "engineer_f", "ed_f", "humart_f") # I realize this line of code is redundant, but just put it in for my own understanding of the order in which things occurred.
rownames(phds_summary) <- c("1985", "2000", "2015")
## Warning: Setting row names on a tibble is deprecated.
#maybe a chisquare tests for proportions of females in each field by year
phds_chi <- chisq.test(phds_summary)
phds_chi
## 
##  Pearson's Chi-squared test
## 
## data:  phds_summary
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
phds_prop <- prop.table(as.matrix(phds_summary), 1)

phds_summary2 <- phds %>%
  select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>% 
  filter(year == "1985" | year == "2000" | year == "2015") 

phds_summary3<-melt(phds_summary2, id.vars = 'year')

phds_graph<- phds_summary3 %>% 
  ggplot(aes(fill = variable, y = value, x = year)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_x_continuous(breaks = c(1985, 2000, 2015)) +
  scale_y_continuous(expand = c(0,0)) +
  theme_classic() +
  labs(x = "\nYear", y = "Proportion\n", title = "Proportion of Female PhD Recipients\n") +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10)) +
  scale_fill_manual(values = c("lightpink1", "palevioletred1", "violetred", "maroon4"), labels = c("Physical and Earth Sciences", "Engineering", "Education", "Humanities and Arts")) +
  theme(legend.title=element_blank())


phds_graph

# p-value < 0.001 therefore there is a significant association between the year a degree was awarded and the number of PhDs awarded to women in that year (X^2 = 2073, *p* , 0.001)
###############################################################
# 3) Male and female salaries for starting postdoctoral and other employment positions (2015)
###############################################################

#2 mann whitney u tests (one for median postdoc salary male vs female, one for median other employment male vs female)

#explore data for posdoc salaries
male_post_sal <- ggplot(med_salary, aes(x = postdoc_m)) +
  geom_histogram(binwidth = 5000, aes(color = "black"))
male_post_sal #not normally distributed

# Checking the qq plot for this distribution
male_post_qq <- ggplot(med_salary, aes(sample = postdoc_m)) +
  geom_qq()
male_post_qq

#Looking like it's potentially linear

female_post_sal <- ggplot(med_salary, aes(x = postdoc_f)) +
  geom_histogram(binwidth = 5000, aes(color = "black"))
female_post_sal #not normally distributed

# Checking the qq plot for this distribution
female_post_qq <- ggplot(med_salary, aes(sample = postdoc_f)) +
  geom_qq()
female_post_qq

# NOT looking linear

#want comparison of means so do Mann Whitney U
#H0: Ranks are equal
#HA: ranks are different
post_sal_test <- wilcox.test(med_salary$postdoc_f, med_salary$postdoc_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with zeroes
post_sal_test
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_salary$postdoc_f and med_salary$postdoc_m
## V = 16.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
med_sal_2<-melt(med_salary, id.vars = 'field')

post_sal_graph <- med_sal_2 %>%  
  filter(variable == "postdoc_m" | variable == "postdoc_f") %>% 
  ggplot(aes(x = field, y = value)) +
  geom_col(aes(fill = variable), position = "dodge") +
  scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
  theme_classic() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)) +
  theme(legend.title=element_blank())+
  labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median Postdoc Salaries 2015\n") +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse="\n"))

post_sal_graph

#explore data for employement salaries
male_employ_sal <- ggplot(med_salary, aes(x = employment_m)) +
  geom_histogram(binwidth = 15000)
male_employ_sal #maybe normally distributed?

female_employ_sal <- ggplot(med_salary, aes(x = employment_f)) +
  geom_histogram(binwidth = 15000)
female_employ_sal #maybe normally distributed?

employ_sal_test <- wilcox.test(med_salary$employment_f, med_salary$employment_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with zeroes
employ_sal_test
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_salary$employment_f and med_salary$employment_m
## V = 4, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
employ_sal_graph <- med_sal_2 %>%  
  filter(variable == "employment_m" | variable == "employment_f") %>% 
  ggplot(aes(x = field, y = value)) +
  coord_flip() +
  geom_col(aes(fill = variable), position = "dodge") +
  scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)) +
  theme(legend.title=element_blank())+
  labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median non Postdoc Salaries 2015\n") +
  theme(text = element_text(family = "Times New Roman")) + 
  theme(plot.title = element_text(hjust = 0.5, size = 13)) +
  theme(axis.title = element_text(size = 12)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse="\n"))

employ_sal_graph

###############################################################
# 4) Exploring academic salaries for professors in U.S. colleges
###############################################################

# Find a good multiple linear regression model with the output being salary, and the variables (some combination of) sex, discipline, rank, years of service, years since phd 

# Some data exploration 
#plot of salary v years service
salary_plot <- ggplot (salary_data, aes(x=salary, y=yrs.service))+
  geom_point(aes(color=sex), alpha=0.6)
salary_plot

#plot of salary v years service
salary_plot2 <- ggplot (salary_data, aes(x=salary, y=yrs.since.phd))+
  geom_point(aes(color=sex), alpha=0.6)
salary_plot2

#plot of salary by faculty rank
facultyrank_plot <- ggplot (salary_data, aes(x=faculty.rank, y=salary))+
  geom_point(aes(color=sex))
facultyrank_plot

# well this doesn't seem to be saying much, not seeing any glaring trends here

# summary table looking at salary by sex and faculty position... interesting? maybe.
avg_salary_sex <- salary_data %>% 
  group_by(sex, faculty.rank) %>% 
  summarize(mean= mean(salary),
            count=n())
avg_salary_sex
## # A tibble: 6 x 4
## # Groups:   sex [?]
##   sex    faculty.rank    mean count
##   <chr>  <chr>          <dbl> <int>
## 1 Female AssocProf     88513.    10
## 2 Female AsstProf      78050.    11
## 3 Female Prof         121968.    18
## 4 Male   AssocProf     94870.    54
## 5 Male   AsstProf      81311.    56
## 6 Male   Prof         127121.   248
# linear model with ALL possible variables

salary_lm1 <- lm (salary ~ sex+faculty.rank+discipline+yrs.since.phd+yrs.service, data=salary_data)

summary(salary_lm1)
## 
## Call:
## lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd + 
##     yrs.service, data = salary_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           78862.8     4990.3  15.803  < 2e-16 ***
## sexMale                4783.5     3858.7   1.240  0.21584    
## faculty.rankAsstProf -12907.6     4145.3  -3.114  0.00198 ** 
## faculty.rankProf      32158.4     3540.6   9.083  < 2e-16 ***
## disciplineB           14417.6     2342.9   6.154 1.88e-09 ***
## yrs.since.phd           535.1      241.0   2.220  0.02698 *  
## yrs.service            -489.5      211.9  -2.310  0.02143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
# taking out yrs.since.phd, since it likely pretty much same as years.service; also makes no sense that the years of service would have a negative coefficient

salary_lm2 <- lm(salary ~ sex+faculty.rank+discipline+yrs.since.phd, data=salary_data)

summary(salary_lm2)
## 
## Call:
## lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd, 
##     data = salary_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67451 -13860  -1549  10716  97023 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           80988.47    4931.84  16.422  < 2e-16 ***
## sexMale                4349.37    3875.39   1.122  0.26242    
## faculty.rankAsstProf -13104.15    4167.31  -3.145  0.00179 ** 
## faculty.rankProf      32928.40    3544.40   9.290  < 2e-16 ***
## disciplineB           13937.47    2346.53   5.940 6.32e-09 ***
## yrs.since.phd            61.01     127.01   0.480  0.63124    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared:  0.4472, Adjusted R-squared:  0.4401 
## F-statistic: 63.27 on 5 and 391 DF,  p-value: < 2.2e-16
# yrs.service still has a negative coefficient... this is not logical

# taking out faculty.rank (since rank is mostly just determined by how long they are there?)
salary_lm3 <- lm(salary ~ faculty.rank + discipline + yrs.since.phd + sex + sex*faculty.rank, data=salary_data)

summary(salary_lm3)
## 
## Call:
## lm(formula = salary ~ faculty.rank + discipline + yrs.since.phd + 
##     sex + sex * faculty.rank, data = salary_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67531 -13938  -1215  10765  96921 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  79193.33    7654.84  10.346  < 2e-16 ***
## faculty.rankAsstProf         -7848.08   10016.19  -0.784 0.433787    
## faculty.rankProf             33599.53    9015.90   3.727 0.000223 ***
## disciplineB                  14028.24    2355.32   5.956 5.79e-09 ***
## yrs.since.phd                   58.23     127.64   0.456 0.648510    
## sexMale                       6464.05    7817.86   0.827 0.408839    
## faculty.rankAsstProf:sexMale -6308.13   10839.48  -0.582 0.560932    
## faculty.rankProf:sexMale      -874.01    9603.83  -0.091 0.927535    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22710 on 389 degrees of freedom
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4379 
## F-statistic: 45.07 on 7 and 389 DF,  p-value: < 2.2e-16
plot(salary_lm3)

salary_lm4<- lm (salary~discipline+yrs.service+sex, data= salary_data)

summary(salary_lm4)
## 
## Call:
## lm(formula = salary ~ discipline + yrs.service + sex, data = salary_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77424 -19404  -4635  15539 105391 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  84361.1     4941.4  17.072  < 2e-16 ***
## disciplineB  13033.8     2840.3   4.589 6.01e-06 ***
## yrs.service    832.2      110.2   7.550 3.07e-13 ***
## sexMale       8423.3     4744.5   1.775   0.0766 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27790 on 393 degrees of freedom
## Multiple R-squared:  0.1646, Adjusted R-squared:  0.1582 
## F-statistic: 25.81 on 3 and 393 DF,  p-value: 2.928e-15
stargazer(salary_lm2, salary_lm3, salary_lm4, type = "html")
## 
## <table style="text-align:center"><tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td colspan="3"><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="3" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td colspan="3">salary</td></tr>
## <tr><td style="text-align:left"></td><td>(1)</td><td>(2)</td><td>(3)</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">sexMale</td><td>4,349.366</td><td>6,464.051</td><td>8,423.335<sup>*</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(3,875.393)</td><td>(7,817.858)</td><td>(4,744.537)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankAsstProf:sexMale</td><td></td><td>-6,308.131</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td>(10,839.480)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankProf:sexMale</td><td></td><td>-874.006</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td>(9,603.828)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankAsstProf</td><td>-13,104.150<sup>***</sup></td><td>-7,848.084</td><td></td></tr>
## <tr><td style="text-align:left"></td><td>(4,167.315)</td><td>(10,016.190)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankProf</td><td>32,928.400<sup>***</sup></td><td>33,599.530<sup>***</sup></td><td></td></tr>
## <tr><td style="text-align:left"></td><td>(3,544.403)</td><td>(9,015.904)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">disciplineB</td><td>13,937.470<sup>***</sup></td><td>14,028.240<sup>***</sup></td><td>13,033.850<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(2,346.534)</td><td>(2,355.315)</td><td>(2,840.349)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">yrs.since.phd</td><td>61.011</td><td>58.228</td><td></td></tr>
## <tr><td style="text-align:left"></td><td>(127.010)</td><td>(127.640)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">yrs.service</td><td></td><td></td><td>832.154<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td>(110.215)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>80,988.470<sup>***</sup></td><td>79,193.330<sup>***</sup></td><td>84,361.070<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(4,931.844)</td><td>(7,654.839)</td><td>(4,941.371)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>397</td><td>397</td><td>397</td></tr>
## <tr><td style="text-align:left">R<sup>2</sup></td><td>0.447</td><td>0.448</td><td>0.165</td></tr>
## <tr><td style="text-align:left">Adjusted R<sup>2</sup></td><td>0.440</td><td>0.438</td><td>0.158</td></tr>
## <tr><td style="text-align:left">Residual Std. Error</td><td>22,663.240 (df = 391)</td><td>22,708.750 (df = 389)</td><td>27,789.810 (df = 393)</td></tr>
## <tr><td style="text-align:left">F Statistic</td><td>63.266<sup>***</sup> (df = 5; 391)</td><td>45.071<sup>***</sup> (df = 7; 389)</td><td>25.810<sup>***</sup> (df = 3; 393)</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td colspan="3" style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>